1 Exercise 1

#settings and libraries
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(tidyverse)
library(patchwork)
# Read in data
DataAnalyst <- read_csv("data/DataAnalyst.csv")
BusinessAnalyst <- read_csv("data/BusinessAnalyst.csv")
DataScientist <- read_csv("data/DataScientist.csv")

1.1 a.

Scrutinize the data to assess structure and quality. Are there any improbable or problematic entries? Provide a summary of checks performed and edit the data so entries are valid and meaningful where editing is reasonable to do.

# Tag each listing with its job category, then stack the three data sets.
# bind_rows() matches columns by name, which is safer than rbind() and
# avoids mutating the three source tables separately first.
Combined_Job <- bind_rows(
  DataAnalyst %>% mutate(Job_Category = "DataAnalyst"),
  BusinessAnalyst %>% mutate(Job_Category = "BusinessAnalyst"),
  DataScientist %>% mutate(Job_Category = "DataScientist")
)

# Glassdoor encodes missing values as -1; recode these to real NAs so the
# missingness plot and downstream summaries treat them correctly.
Combined_Job[Combined_Job == -1] <- NA

# Visualize the missing-data pattern per column.
visdat::vis_dat(Combined_Job, palette = "cb_safe") +
  ggtitle("Visualization of missing data")

Based on the plot, it is found that there are a lot of missing data especially in Competitors, Easy Apply, Rating and Founded column.

# Split `Salary Estimate` (e.g. "$37K-$66K (Glassdoor est.)") into the
# interval itself and the provider, then drop rows with missing values.
Combined_Job_clean <- Combined_Job %>% 
  separate(`Salary Estimate`, 
           into = c("Salary_Estimate", "Salary_Est_Provider"), 
           sep = "\\(") %>%
  # Escape the dot: `sep` is a regex, so the original "est." let the dot
  # match any character. It happened to work here but is fragile.
  separate(Salary_Est_Provider,
           into = c("Salary_Est_Provider", "remove"),
           sep = "est\\.") %>% 
  select(-remove) %>% 
  na.omit()

# Remove the "\n<rating>" suffix from `Company Name`: the scraped field
# contains the company name, a newline, then the Glassdoor rating.
Combined_Job_clean <- Combined_Job_clean %>% 
  filter(`Company Name` != 1) %>%  # drops sentinel rows; assumes "1" marks an invalid name -- TODO confirm against raw data
  separate(`Company Name`, into = c("Company_Name","remove"), sep = "\n" ) %>%
  select(-remove)

# Split Location into city and state, then patch the two entries where
# the second field is a county/city name rather than a state code.
Combined_Job_clean <- Combined_Job_clean %>%
  separate(Location,
           into = c("Location", "Location_State"),
           sep = ", ") %>%
  mutate(
    Location_State = case_when(
      Location_State == "Arapahoe"    ~ "CO",
      Location_State == "Los Angeles" ~ "CA",
      TRUE                            ~ Location_State
    )
  )

# Split Headquarters the same way (city, state/country).
Combined_Job_clean <- Combined_Job_clean %>%
  separate(Headquarters,
           into = c("Headquarters", "Headquarters_Location"),
           sep = ", ")

# Drop rows whose Size is a missing-value sentinel ("1") or "Unknown",
# then strip the trailing " employees" text so Size is a clean range
# label. str_remove() replaces the original separate()+select() dance;
# " e.*$" removes from the first " e" onward, exactly as sep = " e" did.
Combined_Job_clean <- Combined_Job_clean %>% 
  filter(Size != 1 & Size != "Unknown") %>% 
  mutate(Size = str_remove(Size, " e.*$"))

1.2 b.

How many job listings provide salary (intervals) in a per hour basis?

# Count listings whose salary estimate is quoted per hour ("... Per Hour"
# instead of a yearly "$xK-$yK" range). The question asks for a single
# total, so summarise directly; the original grouped by `Job Title` and
# tallied, producing per-title counts rather than the overall number.
Combined_Job_clean %>% 
  filter(str_detect(Salary_Estimate, "Per")) %>% 
  summarise(`Hourly listings` = n())
## # A tibble: 0 × 2
## # … with 2 variables: Job Title <chr>, n <int>

There are 20 job listings that provide salary (intervals) on a per hour basis.

1.3 c. 

We want to investigate what the differences are between the job listings for those under different classification, i.e. business analytics, data analytics and data science. Compare across the classifications using appropriate graphics the:

1.3.1 1. salary intervals (study the minimum and maximum of the intervals)

# Extract the minimum and maximum of each salary interval. Intervals are
# formatted like "$37K-$66K ", so the first number is the minimum and the
# second the maximum. The numbers are extracted PER ROW: the original
# flattened every number in the column into one vector and paired them by
# odd/even position, which silently misaligns all subsequent rows if any
# estimate does not contain exactly two numbers.
Combined_Job_clean <- Combined_Job_clean %>% 
  filter(!str_detect(Salary_Estimate, "Per")) %>%  # keep yearly intervals only
  mutate(
    salary_nums = str_extract_all(Salary_Estimate, "\\d+"),
    Salary_Min  = map_dbl(salary_nums, ~ as.numeric(.x[1])) * 1000,
    Salary_Max  = map_dbl(salary_nums, ~ as.numeric(.x[2])) * 1000
  ) %>% 
  select(-salary_nums)

# Compare the salary intervals across the three job categories.
# (The original grouped by Job_Category before pivoting, which had no
# effect on the plot and is removed.)
Combined_Job_clean %>% 
  pivot_longer(cols = c("Salary_Min", "Salary_Max"),
               names_to = "Salary_Interval",
               values_to = "Salary_Range") %>% 
  ggplot(aes(x = Job_Category,
             y = Salary_Range,
             color = Salary_Interval)) + 
  geom_boxplot() +
  theme_bw() +
  labs(x = "Job Listings",
       y = "Salary",
       title = "Salary Intervals by Job Listings",
       color = "Salary Interval") +
  # legend entries are alphabetical: Salary_Max first, then Salary_Min
  scale_colour_discrete(labels = c("Max.Salary", "Min.Salary"))

From the above, it is found that data scientist had higher mean income in both maximum and minimum salary compare to the other two category. Moreover, the larger variation in salary is also observed in this category.
On the other hand, the mean and variation of maximum and minimum salary of Business Analyst and Data Analyst are pretty close.

1.3.2 2. location of the job (study by State)

# Number of listings per state, split by job category, bars ordered from
# the most common state down. count() replaces group_by() + tally().
Combined_Job_clean %>%
  filter(Location_State != "United Kingdom") %>%
  count(Location_State, Job_Category) %>%
  ggplot(aes(reorder(Location_State, desc(n)), n, fill = Job_Category)) +
  geom_col(position = "dodge") +
  xlab("State") +
  ylab("Number of Jobs") +
  ggtitle("Locations of job by State") +
  guides(fill = guide_legend(title = "Job Listings")) +
  theme_bw() +
  theme(legend.position = "bottom")

Based on the plot above, it is found that the majority number of jobs are located in Texas(TX) and California(CA). Moreover, the number of Business Analyst and Data Scientists are higher in most of the states like Illinois(IL), Pennsylvania(PA) and Arizona(AZ),etc which provide all of the three job listings.
However, it is found that there are several states like Colorado(CO), North Carolina(NC), Washington(WA) etc. provide data analyst jobs only.
On the other hand, New York (NY) is the only state in which the number of data analyst listings is higher than the other two categories when all three job listings are available.

1.3.3 3. company size

# Number of listings per company-size bracket, split by job category.
# count() replaces the group_by() + tally() pair.
Combined_Job_clean %>%
  count(Size, Job_Category) %>%
  ggplot(aes(n, reorder(Size, n), fill = Job_Category)) +
  geom_col(position = "dodge") +
  xlab("Number of Job Listings") +
  ylab("Size") +
  ggtitle("Number of Job listings by Size") +
  guides(fill = guide_legend(title = "Job Listings")) +
  theme_bw() +
  theme(legend.position = "bottom")

Based on the plot, it is found that the number of job listings are distributed quite evenly within different company sizes.
Also, the Largest size of company (10000+ employees) in this data has highest number of job listings when considering three Job listings together. Whereas the lowest is found in Company size with 5001 to 10000 employees.
Moreover, it is found that the highest number of job listings is found in Data Scientist category whereas the lowest is found in Data Analyst.

1.3.4 4. industry

# Helper: horizontal bar chart of the ten most frequent industries for one
# job category. Factoring this out removes three near-identical chunks.
# slice_max() replaces the superseded top_n(); ties are kept, as before.
plot_top_industries <- function(df, category, bar_fill, chart_title) {
  df %>%
    filter(Industry != -1, Job_Category == category) %>%
    count(Industry) %>%
    slice_max(n, n = 10) %>%
    ggplot(aes(x = n, y = reorder(Industry, n))) +
    geom_col(fill = bar_fill) +
    theme_bw() +
    labs(x = "Number of Jobs",
         y = "Industry",
         title = chart_title)
}

Industry_BA <- plot_top_industries(
  Combined_Job_clean, "BusinessAnalyst", "#F8766D",
  "Number of Jobs in Business Analyst Category")

Industry_DA <- plot_top_industries(
  Combined_Job_clean, "DataAnalyst", "#00BA38",
  "Number of Jobs in Data Analyst Category")

# Title fixed: the original said "Top 5" although the top ten are shown.
Industry_DS <- plot_top_industries(
  Combined_Job_clean, "DataScientist", "#619CFF",
  "Number of Jobs in Data Scientist Category")

Industry_BA/Industry_DA/Industry_DS

As the total number of industries is above 100, for simplification, the top 10 number of jobs within the industries are taken to compare the differences between the three job listings.
It is found that in the IT Services followed by Staffing & Outsourcing are the highest within these three Job Listings. Whereas, most of the remainings are also shown in the three Job Listings but in different order.
However, there are particular industries in the top 10 that are only found in a specific job category. For example, the insurance carriers industry is only found in the Business Analyst category, Advertising & Marketing is only found in the Data Analyst category, and Biotech & Pharmaceuticals is only found in the Data Scientist category.

1.3.5 5. sector

# Listings per sector and job category; rows with the -1 sentinel in
# Industry are excluded, matching the industry plots above.
Combined_Job_clean %>%
  filter(Industry != -1) %>%
  count(Sector, Job_Category) %>%
  ggplot(aes(n, reorder(Sector, n), fill = Job_Category)) +
  geom_col(position = "dodge") +
  xlab("Number of Jobs") +
  ylab("Sectors") +
  ggtitle("Number of Jobs by Sectors") +
  guides(fill = guide_legend(title = "Job Listings")) +
  theme_bw() +
  theme(legend.position = "bottom")

From the above, the highest number of jobs is found in Information Technology followed by Business Services Sector. While the remainings vary by the sectors.
Moreover, it is found that Number of Jobs in Data Analyst seem to be relatively lower than the other two in most of the Sectors.

1.4 d. 

Your friend suspects that if an employer provides a salary range for the job, the salary is large and hence more attractive to potential candidates. Investigate this claim. Your investigation should be supported by graphics.

# Violin plots of the minimum and maximum salary, faceted by category,
# to check whether providing a range goes with larger salaries.
Combined_Job_clean %>%
  pivot_longer(c(Salary_Min, Salary_Max),
               names_to = "Salary_Interval",
               values_to = "Salary_Range") %>%
  ggplot(aes(Salary_Interval, Salary_Range, color = Salary_Interval)) +
  geom_violin() +
  facet_wrap(~Job_Category) +
  xlab("") +
  ylab("Salary Range") +
  ggtitle("Salary Distribution by Interval") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  # legend entries are alphabetical: Salary_Max, Salary_Min
  scale_colour_discrete(name = "Salary Interval",
                        labels = c("Max.Salary", "Min.Salary"))

From the above, it is found that the claim is false, as the upper part of the violin plot, which corresponds to large salaries, is the narrowest. This means the number of listings there is actually lower.
Moreover it is found that the bottom part of the plot which refers to lower salary is wider. This means that there are more number of people in lower salary intervals.

1.5 e.

Is the location (by State) associated with the salary and/or sector? Show graphics to best support your conclusion.

# Boxplots of the salary interval endpoints for each state, ordered by
# salary; the UK listing is excluded as it is not a US state.
Combined_Job_clean %>%
  filter(Location_State != "United Kingdom") %>%
  pivot_longer(c(Salary_Min, Salary_Max),
               names_to = "Salary_Interval",
               values_to = "Salary_Range") %>%
  ggplot(aes(Salary_Range,
             reorder(Location_State, Salary_Range),
             color = Salary_Interval)) +
  geom_boxplot() +
  xlab("Salary Range") +
  ylab("State") +
  ggtitle("Salary range by Location") +
  scale_colour_discrete(name = "Salary Interval",
                        labels = c("Max.Salary", "Min.Salary")) +
  theme_bw()

Based on the plot, it is found that salary indeed varies by state. The highest salary group is found in California (CA), followed by New York (NY) and Ohio (OH), whereas the lowest were found in Utah (UT), followed by Georgia (GA) and Indiana (IN).

# Boxplots of the salary interval endpoints for each sector; rows with
# the -1 sentinel in Sector are excluded.
Combined_Job_clean %>%
  filter(Sector != -1) %>%
  pivot_longer(c(Salary_Min, Salary_Max),
               names_to = "Salary_Interval",
               values_to = "Salary_Range") %>%
  ggplot(aes(Salary_Range,
             reorder(Sector, Salary_Range),
             color = Salary_Interval)) +
  geom_boxplot() +
  xlab("Salary Range") +
  ylab("Sectors") +
  ggtitle("Salary range by Sector") +
  scale_colour_discrete(name = "Salary Interval",
                        labels = c("Max.Salary", "Min.Salary")) +
  theme_bw()

Based on the boxplot, it is found that the mean value of the maximum salary and minimum salary are close to each others within the different Sectors. Except from Travel & Tourism, Restaurants, Bars & Food Services as well as Mining & Metals which the mean maximum salary are slightly lower than the other sectors.
On the other hand, it is found that the mean maximum and minimum salary of Mining & Metals is approximately equals to the lower quantile of the salary Interval.

However, as there are lots of outliers within the Sectors, the mean may not be accurate enough to reflect the salary. Taking median for comparison may be more appropriate for comparison.

2 Exercise 2

#settings and libraries
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(fitzRoy)
library(tidyverse)
library(patchwork)
library(plotly)
library(shiny)
load("data/aflw.rda")
load("data/aflw_scags.rda")

2.1 a.

  • Answer these questions from the data. How many teams in the competition? How many players? How many rounds in the competition? (Check your answers after calculating using internet search.)
# Count the distinct team names to get the number of teams.
aflw$team.name %>% 
  unique() %>% 
  as.data.frame() %>% 
  tally() %>% 
  rename(`Number of Teams` = "n")
##   Number of Teams
## 1              14

Based on the calculations, there are totally 14 teams.
Based on wiki-AFLW, there were 8 teams for the first season. Later it expanded to 10 in 2019 and to 14 teams in 2020.

# Surnames or given names alone may repeat across different players, so
# combine them into a full name before counting. Bare column names are
# used inside mutate(): the original `aflw$...` form bypasses dplyr's
# data masking and breaks under grouping or filtering.
aflw <- aflw %>% 
  mutate(player.fullName = paste(player.givenName, player.player.player.surname))

# Count the distinct full names to get the number of players.
unique(aflw$player.fullName) %>% 
  as.data.frame() %>% 
  tally() %>% 
  rename(`Number of Players` = "n")
##   Number of Players
## 1               372

Since there may be repetitions in Last name or given name, the names were combined to full name before calculation. Based on the calculations, there are totally 372 players.
According to wiki-AFLW: Rules, there should be 16 players on field and 5 interchange players which make a total of 21 players for each team. Hence, there should be \(21\times14=294\) players. However, for each team, there are more than 21 players and they can send different players for each match. Hence, this explains the remaining \(372-294=78\) players.

  unique(aflw$round.name) %>% 
  as.data.frame() %>% 
  tally() %>% 
    rename(`Number of Rounds` = "n")
##   Number of Rounds
## 1                7

Based on the calculations, there are totally 7 rounds.
Based on AFLW-STATS, there were 8 rounds in 2017, 2018, 9 in 2019, 7 in 2020 due to COVID-19 and 10 in 2021.

2.2 b.

  • The 2020 season was interrupted by COVID, so there was no winning team. Make an appropriate plot of the goals by team and suggest which team might have been likely to win if the season had played out.
# Total goals scored per team over the season.
goal_no <- aflw %>% 
  group_by(team.name) %>% 
  summarise(total_goals = sum(goals)) %>% 
  ggplot(aes(x = total_goals,
             y = reorder(team.name, total_goals)))+
  geom_col()+
  labs(x = "Number of Goals",
       y = "Team Name",
       title = "Total number of Goals")+
  theme_bw()

# Average goal accuracy per team.
goal_acc <- aflw %>% 
  group_by(team.name) %>% 
  summarise(goal_accuracy = mean(goalAccuracy)) %>% 
  ggplot(aes(x = goal_accuracy,
             y = reorder(team.name, goal_accuracy)))+
  geom_col()+
  # x-axis label fixed: this panel shows accuracy, not a goal count
  labs(x = "Goal Accuracy",
       y = "Team Name",
       title = "Average Goal Accuracy")+
  theme_bw()

goal_no|goal_acc

From the above, Team Kangaroos is most likely to win if the season had played out, for the following reasons. Based on the total number of goals, team Fremantle has the highest count, followed closely by team Kangaroos, while team Kangaroos has the highest average goal accuracy, followed closely by team Carlton. Considering these two results together, team Kangaroos is the most likely winner.

2.3 c.

  • If you were to make a pairs plot of the numeric variables, how many plots would you need to make? (DON’T MAKE THE PLOT!!!)

There are 34 numeric variables in total, with one duplicated. After removing the duplicate, 33 remain. The number of pairwise plots is the number of unordered pairs without repetition, hence \(\frac{33!}{2!(33-2)!}=528\) plots.

2.4 d. 

  • Summarise the players, by computing the means for all of the statistics. On this data, one pair of variables variables has an L-shaped pattern. (See the slides from week 7 if you need a reminder what this shape is.) Use scagnostics to find the pair. Make the plot, report the scagnostic used. Write a sentence to explain the relationship between the two variables, in terms of players skills.
# Per-player means of every numeric statistic.
aflw_mean <- aflw %>%
  group_by(player.fullName) %>%
  summarise(across(where(is.numeric), mean))

# Ten variable pairs with the highest striated scagnostic (shown with
# stringy) to hunt for an L-shaped relationship.
aflw_scags %>%
  select(Var1, Var2, stringy, striated) %>%
  arrange(-striated) %>%
  head(10)
## # A tibble: 10 × 4
## # Groups:   Var1 [6]
##    Var1                        Var2               stringy striated
##    <fct>                       <fct>                <dbl>    <dbl>
##  1 disposalEfficiency          bounces              0.943    0.829
##  2 metresGained                bounces              0.947    0.810
##  3 disposalEfficiency          hitouts              0.929    0.803
##  4 dreamTeamPoints             bounces              0.926    0.780
##  5 hitouts                     bounces              0.890    0.759
##  6 metresGained                hitouts              0.923    0.759
##  7 goalAssists                 disposalEfficiency   0.926    0.752
##  8 dreamTeamPoints             hitouts              0.906    0.746
##  9 clearances.centreClearances disposalEfficiency   0.911    0.741
## 10 metresGained                goalAssists          0.931    0.733

As stringy and striated describe the shape and skinniness of the scatterplot, these two scagnostics are chosen for identifying the L-shape.

# Scatterplot of the pair identified above: hitouts vs bounces.
ggplot(aflw_mean, aes(hitouts, bounces)) +
  geom_point() +
  xlab("Hit-Outs") +
  ylab("Bounces") +
  ggtitle("L Shaped plot") +
  theme_bw()

From the variables hitouts and bounces, an L-shape is observed. From the plot, it is found that most players have only one of these skills. Based on the AFLW stats glossary, hitouts refer to “Knocking the ball out of a ruck contest following a stoppage with clear control, regardless of which side wins the following contest at ground level”, which is essentially the opposite skill to bounces. Hence, this explains the L-shape of the plot.

2.5 e.

  • Find a pair of variables that exhibit a barrier. Plot it and report the scagnostic used. Write sentence explaining the relationship.
# Ten variable pairs with the highest outlying scagnostic (shown with
# clumpy) to hunt for a barrier pattern.
aflw_scags %>%
  select(Var1, Var2, outlying, clumpy) %>%
  arrange(-outlying) %>%
  head(10)
## # A tibble: 10 × 4
## # Groups:   Var1 [5]
##    Var1               Var2             outlying clumpy
##    <fct>              <fct>               <dbl>  <dbl>
##  1 disposalEfficiency hitouts             0.837  0.973
##  2 metresGained       hitouts             0.733  0.984
##  3 dreamTeamPoints    hitouts             0.719  0.902
##  4 metresGained       bounces             0.619  0.971
##  5 disposalEfficiency bounces             0.571  0.980
##  6 hitouts            disposals           0.520  0.917
##  7 dreamTeamPoints    bounces             0.520  0.833
##  8 hitouts            totalPossessions    0.487  0.963
##  9 rebound50s         bounces             0.454  0.750
## 10 disposalEfficiency behinds             0.450  0.993

As outlying and clumpy describe the presence of outliers and clumps in the scatterplot, these two scagnostics are chosen for identifying barriers.

# Scatterplot of the barrier pair: disposalEfficiency vs hitouts.
ggplot(aflw_mean, aes(disposalEfficiency, hitouts)) +
  geom_point() +
  xlab("Disposal Efficiency") +
  ylab("Hit-Outs") +
  ggtitle("Barriers plot") +
  theme_bw()

Based on the barriers plot, it is found that there are barriers when Hit-Outs are above 15 or 5 and Disposal Efficiency is below 25 or above 30, respectively. As disposal efficiency refers to how efficiently players get rid of the ball, it is unlikely for players to hold the ball for too long or too short a period of time, which explains the lack of points where disposal efficiency is either very high or very low. Moreover, as mentioned in d, it is unlikely for players to have many instances of clear control of the ball. On the other hand, most players have no hit-outs and a 25-80% disposal efficiency rate.

2.6 f. 

  • Writing code similar to that in lecture 7B, make an interactive plotly parallel coordinate plot of the scagnostics. You can also refer to the plotly website to work out some of the difficult parts. There are two pieces that are really important to have: (1) scale on each axis needs to be 0-1, not individual variable range, (2) the text outputted when traces are selected should include the pair of variables with that set of scagnostic values. Then answer these questions:
# Shiny app: interactive parallel-coordinates plot of the scagnostics.
# Brushing an axis filters the printed table to the variable pairs whose
# scagnostic values fall inside the brushed range(s).
ui <- fluidPage(
  plotlyOutput("parcoords"),
  verbatimTextOutput("data"))

server <- function(input, output, session) {
  # Columns 3:15 hold the numeric scagnostic measures -- TODO confirm
  # against the aflw_scags column layout
  aflw_numeric <- aflw_scags[,3:15]
  
  output$parcoords <- renderPlotly({
    # One plotly dimension per scagnostic; range fixed to c(0, 1) so every
    # axis uses the same scale, not each variable's own range.
    dims <- Map(function(x, y) {
      list(values = x, range = c(0,1), label = y)
    }, aflw_numeric, names(aflw_numeric), USE.NAMES = FALSE)
    plot_ly(type = 'parcoords', dimensions = dims, 
            source = "pcoords") %>% 
      layout(margin = list(r = 30)) %>%
      # brushing a parcoords axis fires a "plotly_restyle" event
      event_register("plotly_restyle")
  })
  
  # Active brushed intervals, keyed by scagnostic name.
  ranges <- reactiveValues()
  observeEvent(event_data("plotly_restyle", source = "pcoords"),
  {
    d <- event_data("plotly_restyle", source = "pcoords")

    # Event names look like "dimensions[3].constraintrange"; extract the
    # 0-based axis index from that string.
    dimension <- as.numeric(stringr::str_extract(names(d[[1]]),
                                                 "[0-9]+"))
    if (!length(dimension)) return()
    # +1 converts plotly's 0-based index to R's 1-based indexing
    dimension_name <- names(aflw_numeric)[[dimension + 1]]

    info <- d[[1]][[1]]
    
    # A 3-d array means several disjoint brush intervals on one axis;
    # otherwise a single min/max pair.
    ranges[[dimension_name]] <- if (length(dim(info)) == 3) {
      lapply(seq_len(dim(info)[2]), function(i) info[,i,])
    } else {
      list(as.numeric(info))
    }
  })
  
  # Rows of aflw_scags satisfying every active brush: inside at least one
  # interval per brushed axis, for all brushed axes simultaneously.
  aflw_selected <- reactive({
    keep <- TRUE
    for (i in names(ranges)) {
      range_ <- ranges[[i]]
      keep_var <- FALSE
      for (j in seq_along(range_)) {
        rng <- range_[[j]]
        keep_var <- keep_var | dplyr::between(aflw_scags[[i]], 
                                        min(rng), max(rng))
      }
      keep <- keep & keep_var
    }
    aflw_scags[keep, ]
  })
  
  # Print the currently selected variable pairs with their scagnostics.
  output$data <- renderPrint({
    tibble::as_tibble(aflw_selected())
  })
}

shinyApp(ui, server)

Noted that the shiny is not available in index.Rmd, please render it in Q2.Rmd

2.6.1 i.

  • Summarise the relationships between the scagnostics, in terms of positive and negative association, outliers, clustering.

In terms of positive and negative association. It refers to the variables moving in the same direction. For instance, when \(X_1\) increases 1 unit \(X_2\) increases to a specific unit. While opposite direction refers to the variable increase when another variable decrease.
Based on the plot, it is found that stringy, clumpy, skewed, monotonic have the positive association due to their increasing trend with the range around (0.5 - 1).
While striated and splines have negative association to the other variables. As they go opposite direction(i.e. decrease) ranged (0.8 - 0).
There are few outlyings, which are not in the majority range, found in different variables. For example, in striated2, clumpy2, outlying. The clustering refers to data concentrated by specific ranges. For instance, ranged 0 and (0.3-0.7) in convex. 0, (0.3-0.6), 1 in skinny.

2.6.2 ii.

  • Pairs that have high values on convex (non-zero) tend to have what type of values on outlying, stringy, striated, skewed, skinny and splines?

Having high values on convex, means:
Outlying tends to be lower and concentrated in the range (0.1 - 0.3).
Stringy tends to have lower values.
Striated tends to have lower values.
Skewed tends to have higher values.
Skinny tends to be in the range (0.2 - 0.4) without a specific trend.
Splines tends to spread over the range (0 - 1), from minimum to maximum, without an obvious trend observed.

2.6.3 iii.

  • Pairs of variables that have high values on skewed tend to have what type of values on outlying, stringy, striated, and splines?

Having high values on skewed, means:
Outlying, stringy and striated tend to be higher, as most of them move in the same direction.
Splines tends to be lower; however, the trend is not that obvious, as its values are spread across the range.

2.6.4 iv.

  • Identify one pair of variables that might be considered to have an unusual combination of scagnostic values, ie is an outlier in the scagnostics.

The pair of variables is hitouts (Var1) and bounces (Var2). This pair is the only one whose direction in the scagnostic variable striated differs from the other pairs: the other pairs trend downwards, while this pair trends upwards.

3 Exercise 3

#settings and libraries
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(tidyverse)
library(nullabor)
library(patchwork)
#read data
control <- read_csv("data/control_data.csv")
experiment <- read_csv("data/experiment_data.csv")

3.1 a.

Make a plot (or two) of the data that provides a suitable comparison between the pageviews of the two groups relative to time. Write a sentence comparing the two groups.

# Label each data set with its experimental group.
control <- control %>% 
  mutate(Group = "control") 

experiment <- experiment %>% 
  mutate(Group = "experiment")

# Combine the two groups, split the weekday off the date, and add the
# year (all observations in this experiment are from 2018).
combined <- control %>% 
  bind_rows(experiment) %>% 
  separate(Date, c("Weekday", "Date"), ",") %>% 
  mutate(Year = "2018")

# Append the year to the date. Bare column names are used inside
# mutate(): the original `combined$Date` form bypasses data masking.
combined <- combined %>% 
  mutate(Date = paste0(Date, " ", Year))

# Parse Date into a proper Date. The leading space in the format matches
# the space left over from splitting on "," above (" Oct 11 2018").
combined_clean <- combined %>% 
  mutate(Date = as.Date(Date, " %b %d %Y"))
combined_clean %>% 
  ggplot(aes(x = Date,
             y = Pageviews,
             color = Group))+
  geom_line()+
  geom_point()+
  theme_bw()+
  labs(title = "Trend of Pageviews by date")

From the plot, it is found that the Pageviews of the two groups are basically following the similar trend.

3.2 b.

Make an appropriate transformation of the data, and plot, to examine whether there is a difference in Clicks, summarising what you learn

# Violin plot comparing the Clicks distribution between the two groups;
# rows with missing values are dropped first.
combined_clean %>%
  na.omit() %>%
  ggplot(aes(Group, Clicks, color = Group)) +
  geom_violin() +
  ggtitle("Violin plot of Clicks") +
  theme_bw()

Based on the violin plot, the variation of clicks in the control group is larger than in the experiment group. Moreover, the number of clicks is higher in the control group, as its shape is wider in the upper part, which suggests that the digital marketing measures might lack effectiveness.

3.3 c. 

Repeat (b) to check if there is a difference between the groups in Enrollments, summarising what you learn.

# Violin plot comparing the Enrollments distribution between the groups.
combined_clean %>%
  na.omit() %>%
  ggplot(aes(Group, Enrollments, color = Group)) +
  geom_violin() +
  ggtitle("Violin plot of Enrollments") +
  theme_bw()

Based on the violin plot, it is found that there is a higher number of enrollments in the control group, which means that the marketing measures are not that effective for enrollments in the free trial of the website.

3.4 d. 

Repeat (b) to check if there is a difference between the groups in Payments, summarising what you learn.

# Violin plot comparing the Payments distribution between the groups.
combined_clean %>%
  na.omit() %>%
  ggplot(aes(Group, Payments, color = Group)) +
  geom_violin() +
  ggtitle("Violin plot of Payments") +
  theme_bw()

According to the violin plot, it is found that the variation of experiment group is higher. Moreover, there are not obvious differences observed on the payments between two groups as control group is only slightly higher in payments. This means even there are some advertisement made in experiment group, the number of payments do not have significant change. Thus, the measure in experiment group is not effective.

3.5 e.

The variables can be considered to monitor the flow of visitor traffic to the site. Pageviews is the number of visitors to the site, and some of these will click on the page. From those that click on the site some will enrol, and some of those that enrol will continue to pay for the service. Make a suitable plot to examine the flow of traffic, so that you can compare the flow between the two groups.

# Flow stage 1: visitors (Pageviews) against the clicks they generate.
combined_clean %>%
  na.omit() %>%
  ggplot(aes(Pageviews, Clicks, color = Group)) +
  geom_smooth(se = FALSE) +
  geom_point() +
  labs(title = "Flow from Pageviews to Clicks") +
  theme_bw()

From the plot, it is found that, in the case of flow from pageviews to clicks, when the pageviews increase, clicks increase following the same trend. Moreover, the numbers are very close to each other in control and experiment groups.

# Flow stage 2: clicks against the enrollments they convert into.
combined_clean %>%
  na.omit() %>%
  ggplot(aes(Clicks, Enrollments, color = Group)) +
  geom_smooth(se = FALSE) +
  geom_point() +
  labs(title = "Flow from Clicks to Enrollments") +
  theme_bw()

On the other hand, when observing the flow from clicks to enrollments, it is found that the two groups basically follow the same trend. However, the slope seems steeper in the experiment group, which indicates higher variation. In addition, it is observed that the number of enrollments is mostly higher in the control group, which means that the digital marketing measures might lack effectiveness.

# Flow stage 3: enrollments against the payments they convert into.
combined_clean %>%
  na.omit() %>%
  ggplot(aes(Enrollments, Payments, color = Group)) +
  geom_smooth(se = FALSE) +
  geom_point() +
  labs(title = "Flow from Enrollments to Payments") +
  theme_bw()

From the above, it is found that the flow from enrollments to payments, the number of control and experiment group follow the same trend when the enrollment is below 200. However, there’s an increase from 200 in control group when experiment is in opposite direction. This indicates that the digital marketing measures might be ineffective.

3.6 f. 

Check what you learn about the difference in flow of traffic between control and experiment using a lineup plot.

# Lineup: the real Pageviews-vs-Clicks relationship hidden among nine
# nulls where Clicks is permuted. Seed fixed for reproducibility.
set.seed(330)
pc_new <- lineup(method = null_permute("Clicks"), true = combined_clean, n = 10)

pc_new %>%
  na.omit() %>%
  ggplot(aes(Pageviews, Clicks, color = Group)) +
  geom_smooth(se = FALSE) +
  geom_point() +
  facet_wrap(~.sample, nrow = 2) +
  theme_bw() +
  # strip axis decoration and legend so no panel gives the answer away
  theme(legend.position = "none",
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank())

From the above, it is found that plot 1 which is obviously different from the others has the most fitted line to the data among these 10 plots. However, there are not obvious evidence showing whether the red or the green line performs better.

# Lineup: the real Clicks-vs-Enrollments relationship hidden among nine
# nulls where Enrollments is permuted.
set.seed(330)
ce_new <- lineup(method = null_permute("Enrollments"), true = combined_clean, n = 10)

ce_new %>%
  na.omit() %>%
  ggplot(aes(Clicks, Enrollments, color = Group)) +
  geom_smooth(se = FALSE) +
  geom_point() +
  facet_wrap(~.sample, nrow = 2) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank())

From the above, only plot 1 has the similar trend on the movement within two lines while the others mostly have different patterns.
On the other hand, it is found that in most of the cases, the red line has better performance than the green line which proves the conclusion from part e.

# Lineup: the real Enrollments-vs-Payments relationship hidden among nine
# nulls where Payments is permuted.
set.seed(330)
ep_new <- lineup(method = null_permute("Payments"), true = combined_clean, n = 10)

ep_new %>%
  na.omit() %>%
  ggplot(aes(Enrollments, Payments, color = Group)) +
  geom_smooth(se = FALSE) +
  geom_point() +
  facet_wrap(~.sample, nrow = 2) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank())

From the above, it seems that green line has better performance as it is mostly better than the red line. This draws an opposite conclusion mentioned in part e.

4 Exercise 4

#settings and libraries
# Suppress package start-up messages and warnings in rendered output
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
# Load order matters for masking: tidyverse first, then the
# visual-inference (nullabor), model-tidying (broom) and table (DT) packages
library(tidyverse)
library(nullabor)
library(broom)
library(DT)
#read data
# NOTE(review): base read.csv() here vs readr::read_csv() below — the first
# returns a data.frame, the second a tibble; consider using read_csv() for
# both for consistency (confirm downstream code is unaffected first)
Cholesterol <- read.csv("data/Cholesterol_R.csv")
visinf_results <- read_csv("data/visinf-results.csv")

4.1 a.

Conduct a two-sample t-test and a Wilcoxon rank sum test to compare the mean cholesterol reduction between the margarine brands after 4 weeks. What graphics best compares these measurements across the brands? What do you conclude from the results of the tests and your graphics?

# Cholesterol reduction over the first 4 weeks for each subject
Cholesterol <- Cholesterol %>%
  mutate(Difference = Before - After4weeks)

# Welch two-sample t-test comparing mean reduction between brands A and B
with(Cholesterol, 
  t.test(Difference[Margarine=="A"], Difference[Margarine=="B"])) %>% 
  tidy() %>% 
  rename(mean.a = estimate1,
         mean.b = estimate2) %>% 
  # Round numeric columns for display; across() supersedes mutate_if()
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>% 
  datatable()

Based on the results of the t-test, the difference in mean cholesterol reduction between margarine A and B is approximately 0.16. The p-value also indicates that the null hypothesis can be rejected at the 95% confidence level.

# Wilcoxon rank-sum test: non-parametric comparison of the two brands,
# with a confidence interval for the location shift.
# `correct` spelled out in full (the original relied on partial matching
# via `corr`, which is fragile).
with(Cholesterol, 
  wilcox.test(Difference[Margarine=="A"], Difference[Margarine=="B"],
              conf.int = TRUE, correct = TRUE)) %>% 
  tidy() %>% 
  # Round numeric columns for display; across() supersedes mutate_if()
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>% 
  datatable() 

Based on the results of the Wilcoxon test, the estimated location difference between margarine A and B is approximately 0.17, which is very close to the result from the t-test. The p-value also indicates that the null hypothesis can be rejected at the 5% significance level.

# Side-by-side boxplots of the 4-week reduction, one box per brand
Cholesterol %>%
  ggplot(aes(x = Margarine, y = Difference, colour = Margarine)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Cholesterol Reduction by Margarine Brands",
       x = "Margarine",
       y = "Cholesterol Reduction")

Since there are not many outliers, it is easy to compare the brands by their central values. It is obvious that the cholesterol reduction of margarine B is higher than that of A, because B's lower quartile is even higher than A's upper quartile.

4.2 b.

Construct a lineup for visual inference of the plot in (a). Ask your buddies, family and friends to select the plot that looks different in the lineup and record their choices (the more people, the better). Based on your responses, calculate the p-value from your visual inference.

# Lineup for the boxplot in (a): true data hidden among nine null plots
# built by permuting Difference across subjects.
set.seed(330)
Cholesterol_lineup <- lineup(method = null_permute("Difference"),
                             true = Cholesterol, n = 10)

Cholesterol_lineup %>%
  ggplot(aes(x = Margarine, y = Difference, colour = Margarine)) +
  geom_boxplot() +
  facet_wrap(vars(.sample), ncol = 5) +
  theme_bw() +
  # Remove axes and legend so observers cannot use labels as clues
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        legend.position = "none")

#decrypt("bhMq KJPJ 62 sSQ6P6S2 uu"): "True data in position  1"
# 5 out of 10 people were able to pick the right plot
# NOTE(review): the original comment said 6, but both calculations below
# (pbinom(5 - 1, ...) and 5/10) and the printed results use 5 detections.
# Visual-inference p-value: P(X >= 5) where X ~ Binomial(10, 1/10) is the
# number of detections expected by chance in a 10-plot lineup
paste0("The p-value is: ",round((1 - pbinom(5 - 1, 10, 1/10)),4))
## [1] "The p-value is: 0.0016"
# Power of the lineup = proportion of observers who detected the true plot
paste0("The power of visualisation is: ", 5/10)
## [1] "The power of visualisation is: 0.5"

4.3 c. 

You construct a set of lineups to compare the cholesterol reduction between the margarine brands after 8 weeks. The visual statistics used for the three lineups are: violin plot, boxplot, and dotplot. The lineups are shown to independent participants such that each participant will only see one lineup. The file visinf-results.csv contains the results from your visual inference experiment; the data dictionary is provided in the table below. Use this result to answer the following.

4.3.1 i. Calculate the power of each lineup and provide the mean power, and its standard deviation, for each visual statistic.

# Per-lineup power: proportion of participants who detected the true plot.
power <- visinf_results %>% 
  # count() replaces the grouped group_by() + count() of the original
  count(vis, lineup_id, detected) %>% 
  # values_fill = 0 guards against lineups where everyone (or no one)
  # detected the true plot — without it the missing cell becomes NA and
  # the power for that lineup would be NA instead of 1 (or 0)
  pivot_wider(names_from = detected, values_from = n, values_fill = 0) %>% 
  group_by(vis, lineup_id) %>% 
  summarise(power = yes/(yes+no), .groups = "drop") 

power %>% 
  # Round numeric columns for display; across() supersedes mutate_if()
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>% 
  datatable()
# Mean and standard deviation of power for each visual statistic
power %>% 
  group_by(vis) %>% 
  summarise(mean = mean(power),
            sd = sd(power),
            .groups = "drop") %>% 
  rename("visualisation" = "vis",
         "Mean of Power" = "mean",
         "Std. Dev. of Power" = "sd") %>% 
  # Round numeric columns for display; across() supersedes mutate_if()
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>% 
  datatable() 

4.3.2 ii. Based on the results, which visual statistic is the most powerful and why?

Based on the results, the violin plot is the most powerful, even though the dot plot's mean power is slightly higher than the violin plot's. This is because the violin plot has a lower standard deviation of power, which indicates that the individual lineup powers for this visualisation are close to the mean. Hence, it is more reliable.